#This file produces all text-based measures used in section 4 of the Cambridge Element
#Run on MacOS Monterey 12.7.4
#R version 4.3.2 (2023-10-31)

#Load packages
rm(list = ls()) #clear environment
library(stringr) #1.5.1
library(tibble) #3.2.1
library(quanteda) #3.3.1
library(quanteda.textplots) #0.94.3
library(quanteda.textmodels) #0.9.6
library(dplyr) #1.1.4
library(text2vec) #0.6.4
library(data.table) #1.14.10
library(readxl) #1.4.3


##########################################
#Loading and pre-processing speech data
##########################################

#Load speech data from the House of Commons: one file per year, stacked in
#chronological order (2017, 2018, 2019)
blum <- do.call(
  rbind,
  lapply(
    c("./HoC Speeches/2017.csv", "./HoC Speeches/2018.csv", "./HoC Speeches/2019.csv"),
    read.csv,
    header = TRUE,
    row.names = 1
  )
)

#Get rid of debates before the 2017 election (keep dates strictly after 2017-06-12)
blum$hdate <- as.Date(blum$hdate)
blum <- blum[blum$hdate > as.Date("2017-06-12"), ]

#MP-party combinations: reverse the row order and keep the first hit per name,
#i.e. the party recorded on the MP's last row in the stacked data
partynames <- blum %>%
  dplyr::select(name, party) %>%
  arrange(desc(row_number())) %>%
  distinct(name, .keep_all = TRUE)

#Debate topics: strip the "oral answers to questions: " prefix (any case)
blum$parent <- gsub("oral answers to questions: ", "", blum$parent, fixed = FALSE, ignore.case = TRUE)

#Drop administrative/procedural debates/daily orders: a row is removed when its
#title matches any of the three pattern groups below (case-insensitive)
blum <- blum[!Reduce(`|`, lapply(
  c(
    "Delegated Legislation|Business without Debate|Election of Speaker|Points of Order|Business of the House Commission",
    "Business of the House Commission Bill|Advisory Committee on Business Appointments|Business Before Questions|Parliamentary Business",
    "Business of the House|Backbench Business Committee|Backbench Business"
  ),
  grepl,
  x = blum$parent,
  ignore.case = TRUE
)), ]

#Create data frame to store estimates (one row per MP)
MP_data <- data.frame(MP = partynames$name, party = partynames$party)


########################
#Constituency mentions
########################

#Get data, coerce the text columns to character and harmonise one spelling
const <- blum
const$constituency <- as.character(const$constituency)
const$body <- as.character(const$body)
const$constituency[const$constituency == "Ynys M\303\264n"] <- "Ynys Mon"

#Collapse on MP level
#const1: all speech items of an MP pasted into one string
#const2: the first constituency recorded for the MP. The previous version used
#head() (with a meaningless `collapse` argument) and then took column 1 of the
#resulting matrix, which only works when every MP has at least six rows;
#taking the first element directly gives the same value without that assumption
const1 <- aggregate(body ~ name, data = const, FUN = paste, collapse = " ")
const2 <- aggregate(constituency ~ name, data = const, FUN = function(x) x[1])
const <- merge(const1, const2, by = "name")

#Count total number of words per MP (pieces obtained by splitting on non-word characters)
#lengths() avoids sapply() attaching every full speech text as an element name
const$totalnowords <- lengths(strsplit(const$body, "\\W+"))

#Create different versions of constituency names
#split_constituency: the name with the stand-alone words North, South, West,
#East, City, of, Central and The removed (case-sensitive), then trimmed
const$split_constituency <- trimws(gsub(
  "\\<(North|South|West|East|City|of|Central|The)\\>",
  "",
  const$constituency
))

#term1 is the stripped name as a whole; term2-term7 are its first six pieces
#after splitting on non-word characters (NA where the name has fewer pieces).
#The split is done once for all rows instead of six times per row inside a
#row-wise loop, and no longer relies on 1:length(), which misbehaves on empty input
const$term1 <- const$split_constituency
name_tokens <- strsplit(const$split_constituency, "\\W+")
for (k in 1:6) {
  const[[paste0("term", k + 1)]] <- vapply(name_tokens, function(tok) tok[k], character(1))
}
rm(name_tokens, k)

#Implement some corrections
#set_terms() overwrites the given term columns for every row whose constituency
#is one of `constituencies` and returns the modified data frame.
#NOTE(review): several replacement terms carry a trailing blank; they are kept
#exactly as they were - presumably to avoid matching longer words, confirm.
set_terms <- function(df, constituencies, ...) {
  fixes <- list(...)
  hit <- df$constituency %in% constituencies
  for (col in names(fixes)) {
    df[[col]][hit] <- fixes[[col]]
  }
  df
}

const <- set_terms(const, "Ynys Mon", term3 = "Mon ")
const <- set_terms(const, "Na h-Eileanan an Iar",
                   term2 = "h-Eileanan ", term3 = "Iar ", term4 = NA, term5 = NA, term6 = NA)
const <- set_terms(const, "Moray", term1 = "Moray ", term2 = "Moray ")
const <- set_terms(const,
                   c("Stoke-on-Trent Central", "Stoke-on-Trent North", "Stoke-on-Trent South"),
                   term3 = "Trent ", term4 = NA)
const <- set_terms(const, "Faversham and Mid Kent", term3 = "Kent ", term4 = NA)
const <- set_terms(const, "Cities of London and Westminster",
                   term2 = "London ", term3 = "Westminster", term4 = NA)
const <- set_terms(const, "Stratford-on-Avon", term3 = "Avon ", term4 = NA)
const <- set_terms(const, "Ashton-under-Lyne", term3 = "Lyne ", term4 = NA)
const <- set_terms(const, "Berwick-upon-Tweed", term3 = "Tweed ", term4 = NA)
const <- set_terms(const,
                   c("Newcastle upon Tyne North", "Newcastle upon Tyne East", "Newcastle upon Tyne Central"),
                   term3 = "Tyne ", term4 = NA)
const <- set_terms(const, "Newcastle under Lyme", term3 = "Lyme ", term4 = NA)
const <- set_terms(const, c("Kingston upon Hull North", "Kingston upon Hull East"),
                   term3 = "Hull ", term4 = NA)
const <- set_terms(const, "Kingston upon Hull West and Hessle",
                   term3 = "Hull ", term4 = "Hessle", term5 = NA, term6 = NA)
const <- set_terms(const, "Mid Dorset and North Poole",
                   term2 = "Dorset", term3 = "Poole", term4 = NA)
const <- set_terms(const, "Mid Bedfordshire", term2 = "Bedfordshire", term3 = NA)

#For these constituencies the full, unstripped name is used as term1
#(names with an inner "South"/"West"/"of"/"The" etc.)
full_name_terms <- c(
  "Kingston upon Hull West and Hessle",
  "Mid Dorset and North Poole",
  "Isle of Wight",
  "Vale of Glamorgan",
  "Vale of Clwyd",
  "Linlithgow and East Falkirk",
  "Arundel and South Downs",
  "Oldham East and Saddleworth",
  "Edinburgh North and Leith",
  "Carmarthen East and Dinefwr",
  "Blackpool North and Cleveleys",
  "Harwich and North Essex",
  "Hereford and South Herefordshire",
  "Cities of London and Westminster",
  "South Holland and The Deepings",
  "Carmarthen West and South Pembrokeshire",
  "Uxbridge and South Ruislip",
  "Hackney North and Stoke Newington",
  "Penrith and The Border",
  "Islington South and Finsbury",
  "Torridge and West Devon",
  "Dunfermline and West Fife",
  "Perth and North Perthshire",
  "Maidstone and The Weald",
  "Lewisham West and Penge",
  "Central Suffolk and North Ipswich",
  "St Helens South and Whiston",
  "Sleaford and North Hykeham",
  "Liverpool, West Derby",
  "Ealing Central and Acton",
  "Ochil and South Perthshire",
  "Dulwich and West Norwood",
  "Middlesbrough South and East Cleveland",
  "Oxford West and Abingdon",
  "Bridgwater and West Somerset"
)
keep_full <- const$constituency %in% full_name_terms
const$term1[keep_full] <- const$constituency[keep_full]

#Blank out search terms that are exactly "and", "the" or "St"
for (term_col in paste0("term", 1:7)) {
  const[[term_col]][const[[term_col]] %in% c("and", "the", "St")] <- NA
}

rm(set_terms, full_name_terms, keep_full, term_col)

#Count number of constitutive search terms (non-missing entries among term2-term7)
#Columns are addressed by name rather than by position (previously const[,7:12])
term_cols <- paste0("term", 2:7)
const$search_terms <- rowSums(!is.na(const[, term_cols, drop = FALSE]))

#Count mentions of constituency names, "constituency" and "constituents"
#All counts are case-sensitive substring counts, computed for all MPs at once
#instead of through row-by-row data-frame assignment.
#Search terms are matched literally via fixed(): they are names, not regular
#expressions, so a name containing regex metacharacters (e.g. parentheses) is
#now matched as written. Results differ from the regex version only for such names.
whole_name_hits <- str_count(const$body, fixed(const$term1)) #NA where term1 is missing, as before

part_hits <- numeric(nrow(const))
for (col in term_cols) {
  has_term <- !is.na(const[[col]]) #missing terms contribute nothing
  if (any(has_term)) {
    part_hits[has_term] <- part_hits[has_term] +
      str_count(const$body[has_term], fixed(const[[col]][has_term]))
  }
}

#Each mention of the entire name is also counted once by every constitutive term;
#subtracting whole_name_hits * search_terms leaves one count per full mention
const$mentions_constituency <- whole_name_hits + part_hits - whole_name_hits * const$search_terms
const$mentions_generic_constituency <- as.numeric(str_count(const$body, pattern = "constituency"))
const$mentions_constituents <- as.numeric(str_count(const$body, pattern = "constituent")) #also hits "constituents"

rm(term_cols, whole_name_hits, part_hits, has_term, col)

#Pairwise correlations between the three forms of constituency focus
with(const, cor(mentions_constituency, mentions_generic_constituency))
with(const, cor(mentions_constituency, mentions_constituents))
with(const, cor(mentions_generic_constituency, mentions_constituents))

#Construct measure: share of an MP's words that refer to the constituency, in percent
const$local_focus <- with(
  const,
  (mentions_constituency + mentions_generic_constituency + mentions_constituents) / totalnowords * 100
)

#Assign to data (only MPs present in both tables are kept)
merge <- const[, c("name", "constituency", "local_focus", "mentions_constituency",
                   "mentions_generic_constituency", "mentions_constituents", "totalnowords")]
MP_data <- base::merge(MP_data, merge, by.x = "MP", by.y = "name")


#############################################
#Wordscores models for left-right positions
#############################################

#Descriptive statistics about the data
#One row per distinct debate title/date combination, with an empty topic column in front
debatetopics <- blum %>%
  dplyr::distinct(parent, hdate) %>% #there are 3228 debates
  dplyr::mutate(topic = NA) %>%
  dplyr::select(topic, title = parent, hdate)

#How many speech interventions? One row per speech item
speechitems <- dplyr::select(blum, name, date = hdate, body, title = parent)

#Set some reference scores, group membership as of 11/03/2021
#Every speech starts at 0; 0 is recoded to NA further below, so only the
#reference MPs listed here carry a score
blum$reference_score <- 0

#Left-right, economic, values
#Negative scores (-1): some members of the Socialist Campaign Group
blum$reference_score[blum$name %in% c(
  "John Martin McDonnell", "Richard Burgon", "Lloyd Russell-Moyle", "Rebecca Long-Bailey",
  "Jeremy Corbyn", "Ian Lavery", "Emma Dent Coad", "Karen Lee",
  "Laura Pidcock", "Danielle Rowley", "Dennis Skinner", "Laura Smith",
  "Ronnie Campbell", "Diane Abbott", "Dan Carden", "Marsha de Cordova",
  "Imran Hussain", "Clive Lewis", "Rachael Maskell", "Andy McDonald",
  "Ian Mearns", "Grahame Morris", "Kate Osamor", "Jon Trickett"
)] <- -1

#Positive scores (+1): some members of the Cornerstone Group
blum$reference_score[blum$name %in% c(
  "Edward Leigh", "Sir David Amess", "Bill Cash", "John Redwood",
  "John Whittingdale", "Christopher Chope", "Owen Paterson", "Laurence Robertson",
  "Ian Liddell-Grainger", "Greg Knight", "Andrew Rosindell", "Peter Bone",
  "Stephen Crabb", "David Davies", "Philip Davies", "Nadine Dorries",
  "Robert Goodwill", "Greg Hands", "Philip Hollobone", "Adam Holloway",
  "David Jones", "Daniel Kawczynski", "Charles Walker", "Nigel Adams",
  "Steven Baker", "Fiona Bruce", "Robert Halfon", "Sajid Javid",
  "Kwasi Kwarteng", "Jacob Rees-Mogg", "Martin Vickers", "John Hayes"
)] <- 1

#Laver-Garry Economic/Values dictionary (categories for state intervention and values)  - deleted house, inserted housing and households instead (found that all debate titles that include the word "House" meant the HOC)
#Changed "partnership" into "economic partnership" because all other debates pertained to "civil partnership"
#Changed "inter_racial" to "inter-racial" or "interracial"
#
#The pattern is assembled from two term lists instead of one hand-written string.
#This repairs four entries that were broken in the single-string version:
#  - "\\<VULNERABLE\\<\\>" could never match; now VULNERABLE is a whole-word term
#  - "WIDOW" lacked the word-start anchor every other term has
#  - "HNOUR" was a typo for HONOUR
#  - "economic PARTNERSHIP" contained a line break inside the string literal
#These repairs can change which debates are selected.

#Terms that must match as complete words
lg_whole_words <- c(
  "ACCOMMODATION", "AGE", "AMBULANCE", "ASSIST", "BENEFIT", "CARE", "CARER", "CHILD",
  "CLASS", "CLASSES", "CLINICS", "CO-OPERATIVE", "DEPRIVATION", "DISABILITIES",
  "DISADVANTAGED", "ELDERLY", "ESTABLISH", "HARDSHIP", "HUNGER", "INVEST", "INVESTING",
  "INVESTMENT", "PATIENTS", "PENSION", "POOR", "POORER", "POOREST", "POVERTY", "SCHOOL",
  "TRANSPORT", "VULNERABLE",
  "ACCOUNTANT", "ACCOUNTING", "ACCOUNTS", "BARGAINING", "ELECTRICITY", "FEE", "FEES",
  "HOUSING", "HOUSEHOLDS", "IMPORT", "IMPORTS", "JOBS", "OPPORTUNITY",
  "economic PARTNERSHIP", "PERFORMANCE", "PRODUCTIVITY", "SETTLEMENT", "SOFTWARE",
  "SUPPLY", "TRADE", "WELFARE",
  "ASSETS", "AUTONOMY", "BID", "BIDDERS", "BIDDING", "CONFIDENCE", "CONFISCATORY",
  "CONTROLLED", "CONTROLLING", "CONTROLS", "CORPORATE", "DEREGULATING", "EXPENSIVE",
  "FUND-HOLDING", "INITIATIVE", "INTRUSIVE", "MONETARY", "MONEY", "PRIVATE", "PRIVATELY",
  "PRIVATISATIONS", "PRIVATISED", "PRIVATISING", "PROFITABLE", "RISK", "RISKS", "SAVINGS",
  "SHARES", "SPONSORSHIP", "TAXABLE", "TAXES", "TAX-FREE", "TRADING", "VALUE",
  "DEFEND", "DEFENDED", "DEFENDING", "DISCIPLINE", "GLORIES", "GLORIOUS", "GRAMMAR",
  "HERITAGE", "INTEGRITY", "MAINTAIN", "MAJESTY", "MARRIAGE", "PAST", "PRIDE", "PROBITY",
  "PROFESSIONALISM", "PROUD",
  "INNOCENT", "INTER-RACIAL", "INTERRACIAL", "RIGHTS"
)

#Terms that only need to match the beginning of a word
lg_prefixes <- c(
  "COLLECTIVE", "CONTRIBUTION", "COOPERATIVE", "EDUCAT", "EQUAL", "FAIR", "GUARANTEE",
  "HEALTH", "HOMELESS", "HOSPITAL", "INEQUAL", "MEANS-TEST", "NURSE", "REHOUSE",
  "RE-HOUSE", "TEACH", "UNDERFUND", "UNEMPLOY", "WIDOW",
  "ADVERT", "AIRLINE", "AIRPORT", "AUDIT", "BANK", "BREADWINNER", "BUDGET", "BUY",
  "CARTEL", "CASH", "CHARGE", "COMMERCE", "COMPENSAT", "CONSUM", "COST", "CREDIT",
  "CUSTOMER", "DEBT", "DEFICIT", "DWELLING", "EARN", "ECON", "ESTATE", "EXPORT",
  "FINANC", "INDUSTR", "LEASE", "LOAN", "MANUFACTUR", "MORTGAGE", "NEGOTIAT",
  "PASSENGER", "PAY", "PORT", "PROFESSION", "PURCHAS", "RAILWAY", "REBATE", "RECESSION",
  "RESEARCH", "REVENUE", "SALAR", "SELL", "SUPPLIER", "TELECOM", "TELEPHON", "TENAN",
  "TOURIS", "TRAIN", "WAGE", "WORK",
  "BARRIER", "BURDEN", "CHARIT", "CHOICE", "COMPET", "CONSTRAIN", "CONTRACTING",
  "CONTRACTOR", "CORPORATION", "DISMANTL", "ENTREPRENEUR", "FLEXIB", "FRANCHISE",
  "FUNDHOLD", "HOMESTEAD", "INVESTOR", "LIBERALI", "MARKET", "OWN", "PRODUCE", "REGULAT",
  "RETAIL", "SIMPLIF", "SPEND", "THRIFT", "VOLUNT", "VOUCHER",
  "HISTOR", "HONOUR", "IMMIGRA", "INHERIT", "JUBILEE", "LEADER", "OBSCEN", "PORNOGRAPH",
  "PRESERV", "PRINCIPL", "PUNCTUAL", "RECAPTURE", "RELIAB", "THREAT", "TRADITION",
  "CRUEL", "DISCRIMINAT", "INJUSTICE", "MINORIT", "REPRESSI", "SEX"
)

lg_pattern <- paste(
  c(paste0("\\<", lg_whole_words, "\\>"), paste0("\\<", lg_prefixes)),
  collapse = "|"
)

#Subset to debates that have LG terms in their title
#Left-right (economic + values); matching is case-insensitive
left_right <- blum[grepl(lg_pattern, blum$parent, ignore.case = TRUE), ]
rm(lg_whole_words, lg_prefixes, lg_pattern)

#Check face validity (the data viewer is only opened in interactive sessions)
if (interactive()) {
  View(list(unique(left_right$parent)))
}

#Collapse speeches by MP, so that each MP represents one document in the corpus
#lr_text: all speech items of an MP pasted into one string
#lr_score: the MP's (constant) reference score
#The score is attached by matching on the MP name; the previous cbind() of the
#two aggregates silently relied on both having exactly the same rows in the same order
lr_text <- aggregate(body ~ name, data = left_right, FUN = paste, collapse = " ")
lr_score <- aggregate(reference_score ~ name, data = left_right, FUN = mean)
left_right <- lr_text
left_right$reference_score <- lr_score$reference_score[match(left_right$name, lr_score$name)]
rm(lr_text, lr_score)

#A score of 0 marks MPs without a reference position
left_right$reference_score[left_right$reference_score == 0] <- NA

#Remove MPs that spent less than 200 words on the dictionary words, because that is insufficient to estimate their position
left_right <- left_right[lengths(strsplit(left_right$body, "\\W+")) > 200, ]

#Create corpora: one document per MP, with the MP name and reference score as docvars
left_right <- corpus(
  left_right$body,
  docvars = data.frame(MP = left_right$name, reference_score = left_right$reference_score)
)

#Build DFMs, removing punctuation and English stopwords, lowercasing, no stemming
#Tokenising first and removing stopwords with dfm_remove() replaces the deprecated
#dfm(<corpus>, remove = ...) call and matches the dfm(tokens()) usage further below
dfm_left_right <- left_right %>%
  tokens(remove_punct = TRUE) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(pattern = stopwords(language = "en"))

#Estimate Wordscores models: left-right
#Documents are named after the MP so that estimates can be merged by name
docnames(dfm_left_right) <- dfm_left_right$MP
wordscores_left_right <- quanteda.textmodels::textmodel_wordscores(dfm_left_right,
                                                          dfm_left_right$reference_score,
                                                          scale = "linear",
                                                          smooth = 0)
summary(wordscores_left_right)
sort(coef(wordscores_left_right))
estimates_left_right <- predict(wordscores_left_right, se.fit = TRUE, newdata = dfm_left_right)
textplot_scale1d(estimates_left_right) #Face validity: plotting MPs' positions
#Face validity: Guy Opperman as one of the most left-wing Conservatives, David Crausby as most right-wing Labour MP, also Derek Twigg (both voted against same-sex marriage), moreover Angela Smith

#Assign to data (MPs without an estimate get NA)
merge <- cbind.data.frame(WS_left_right = estimates_left_right$fit, WS_left_right_se = estimates_left_right$se.fit, MP = docnames(dfm_left_right))
MP_data <- base::merge(MP_data, merge, by.x = "MP", by.y = "MP", all.x = TRUE)


###################
#Female word usage
###################

#Get all data
female <- blum

#Collapse on MP level: concatenate separate speech items by MP name
female <- aggregate(body ~ name, data = female, FUN = paste, collapse = " ")

#Create corpus (one document per MP)
female <- corpus(female$body, docvars = data.frame(MP = female$name))

#Create dictionary of female words (glob patterns)
female_dict <- dictionary(list(female= c("aunt*", "bachelorette", "bride*", "chick", "chick'*", "chicks",
                                         "cowgirl*", "dame*", "damsel*", "daughter*", "duchess*", "exgf*", "exgirl*",
                                         "exwife*", "exwive*", "female", "females",  "feminine",  "femininity",
                                         "femme*", "fiancee*",  "gal",  "gals", "gentlewom*", "gf*", "girl", "girl's",
                                         "girlh*",  "girli*",  "girls*",  "girly",  "goddess*",  "godmother",  "granddau*",
                                         "grandm*",  "granny",  "grl*",  "gurl*",  "her",  "heroine*",  "hers",  "herself",
                                         "housewi*",  "ladies",  "lady",  "lady's",  "lass",  "lassie",  "lesbian*",  "ma",
                                         "ma'am",  "ma's", "maam",  "madam",  "madame*",  "mademoiselle*",  "maid",  "maid's",
                                         "maiden",  "maids",  "mam",  "mama",  "maternal*",  "maternity",  "matriarch*",  "milf*",
                                         "mimi",  "mimi'*",  "mimis",  "missus",  "mistres*",  "mom",  "mom's",  "momma*",  
                                         "mommy*",  "moms",  "mother",  "mother's",  "mothered",  "motherhood",  "mothering",  
                                         "motherl*", "mothers", "mrs",  "ms",  "mum",  "mum's",  "mummy*",  "mums",  "nana",
                                         "nana's",  "niece*",  "princess*",  "queen",  "queen's",  "queens",  "schoolgirl*",  
                                         "senora",  "senorita",  "she",  "she'd",  "she'll",  "she's",  "she-*",  "sis",
                                         "sister*", "sororit*",  "step-dau*", "step-moth*",  "stepdau*",  "stepmoth*", "sugarmam*",
                                         "sugarmom*", "tomboy*", "wench*", "wife*", "witch*", "wive*", "woman", "woman's",  
                                         "womanhood",  "womanly",  "womans", "women*")))

#Create DFM (lowercased, punctuation removed); tokenising first replaces the
#deprecated dfm(<corpus>) call
dfm_female <- dfm(tokens(female, remove_punct = TRUE), tolower = TRUE)
docnames(dfm_female) <- dfm_female$MP

#Count women words
female_words <- dfm_lookup(dfm_female, female_dict, valuetype = "glob")
#High face validity: virtually all MPs at the top are women and all at the bottom are men; Jeremy Corbyn is the male MP who uses the most female words

#Assign to data
#The counts are read from the dense matrix: the @x slot of a sparse matrix only
#stores non-zero entries, so it is shorter than the number of MPs (and no
#longer lines up with the MP names) as soon as one MP uses no female word
merge <- cbind.data.frame(women_words = as.numeric(as.matrix(female_words)[, 1]), MP = docnames(female_words))
MP_data <- base::merge(MP_data, merge, by.x = "MP", by.y = "MP", all.x = TRUE)
MP_data$women_words_perc <- MP_data$women_words / MP_data$totalnowords * 100 #percentage of all words

#Remove from environment
rm(female)


#####################################
#Word embeddings for justification
#####################################

#Load seed dictionaries: comma-separated terms, one data frame column `words` each
partwords <- data.frame(
  words = unlist(str_split(readLines("./Justification model/partseeddict.txt"), pattern = ","))
)
socwords <- data.frame(
  words = unlist(str_split(readLines("./Justification model/societalseeddict.txt"), pattern = ","))
)

#Remove up to two leading blanks from every term
socwords[, 1] <- sub("^ {1,2}", "", socwords[, 1])
partwords[, 1] <- sub("^ {1,2}", "", partwords[, 1])

#Create original versions of compounded terms
#Collect every term containing "_" and then every term containing "-" (societal
#terms before part terms in both cases); sepwords is the term with "_" replaced by a blank
tocompound <- data.frame(words = c(
  socwords$words[grepl("_", socwords$words)],
  partwords$words[grepl("_", partwords$words)],
  socwords$words[grepl("-", socwords$words)],
  partwords$words[grepl("-", partwords$words)]
))
tocompound$sepwords <- gsub("_", " ", tocompound$words)

#Prepare corpus
corpus_we <- blum %>% mutate(row = row_number()) #add a running row index as column `row`

#Rename the speech column to "text". It is selected by name: the previous
#positional rename (column 20) silently depended on how many columns had been
#appended to `blum` earlier in the script
stopifnot("body" %in% names(corpus_we))
names(corpus_we)[names(corpus_we) == "body"] <- "text"
corpus_we$text <- as.character(corpus_we$text) #transform texts to character
corpus_we$original <- corpus_we$text #save original text

#Replace the spaced-out version of each compound term with its dictionary form
#(case-insensitive); this takes a few minutes. seq_len() keeps the loop from
#running when there are no compound terms
for (i in seq_len(nrow(tocompound))) {
  corpus_we$text <- gsub(tocompound$sepwords[i], tocompound$words[i], corpus_we$text, ignore.case = TRUE)
}

corpus_we$text <- gsub("(?!_)[[:punct:]]", " ", corpus_we$text, perl = TRUE) #remove punctuation except for "_"
corpus_we$text <- tolower(corpus_we$text) #lowercasing

#Remove English stopwords (SMART list; requires the tm package to be installed,
#it is not attached at the top of the file)
stopwords_regex <- paste0("\\b", paste(tm::stopwords("SMART"), collapse = "\\b|\\b"), "\\b")
corpus_we$text <- stringr::str_replace_all(corpus_we$text, stopwords_regex, "")
corpus_we$text <- gsub(" s ", " ", corpus_we$text) #remove the stray "s" left over from possessives
corpus_we$text <- str_squish(corpus_we$text) #strip whitespace

#Save pre-processed corpus
corpus_we_save <- corpus_we

#Tokenize and create vocabulary
tokens_we <- space_tokenizer(corpus_we$text, sep = " ") #split on single blanks
tokens_we <- itoken(tokens_we, ids = as.integer(rownames(corpus_we)), progressbar = TRUE) #token iterator, documents labelled by row name
voc <- create_vocabulary(tokens_we) #create vocabulary
voc <- prune_vocabulary(voc, term_count_min = 10) #terms that occur at least 10 times across the entire corpus
vectorizer <- vocab_vectorizer(voc) #create a vectorizer that maps terms onto indices

#Construct term co-occurrence matrix
tcm <- create_tcm(tokens_we, vectorizer, skip_grams_window = 10, skip_grams_window_context = "symmetric") #window of 10 to either side

#Estimate GloVe model
#NOTE(review): the fit runs on 8 threads; results may not be exactly
#reproducible from the seed alone - confirm against the text2vec documentation
set.seed(2143269)
glove <- GlobalVectors$new(rank = 300, x_max = 10, learning_rate = 0.1) #estimate J=300 dimensions
word_embed <- glove$fit_transform(tcm, n_iter = 100, convergence_tol = 0.001, n_threads = 8) #run model
#Get word embeddings for context words
word_embed_context <- glove$components
word_embed_total <- word_embed + t(word_embed_context) #add word vectors of target word and context as suggested in GloVe paper

#Save workspace
#Create the output folder first so the (long) model fit is not lost when the folder is missing
dir.create("./Created auxiliary datasets", showWarnings = FALSE, recursive = TRUE)
save.image("./Created auxiliary datasets/textmodels_after_GloVe.Rdata")

#Check face validity: list the ten vocabulary terms closest to two probe words
#norm = "l2" makes sim2() rescale the input rows to unit length before the
#cosine similarity is computed; it is a normalisation step, not a regularisation penalty
woman <- word_embed_total["woman", , drop = FALSE] #word embedding of woman
cos_sim_woman <- sim2(x = word_embed_total, y = woman, method = "cosine", norm = "l2")
head(sort(cos_sim_woman[, 1], decreasing = TRUE), 10) #words most similar to woman

society <- word_embed_total["society", , drop = FALSE] #word embedding of society
cos_sim_society <- sim2(x = word_embed_total, y = society, method = "cosine", norm = "l2")
head(sort(cos_sim_society[, 1], decreasing = TRUE), 10) #words most similar to society

#Apply pre-processing steps to seed dictionary terms:
#drop blanks and hyphens, then lowercase
partwords$words <- tolower(gsub("[ -]", "", partwords$words))
socwords$words <- tolower(gsub("[ -]", "", socwords$words))

#Identify dictionary terms
#`terms` lists every term of the embedding matrix together with its row position
terms <- data.frame(words = rownames(word_embed_total))
terms$index <- rownames(terms)

#Seed terms that are present in the embedding vocabulary, with their row positions
socindex <- merge(terms, socwords, by = "words") #identifies indices of republican terms - only 32 terms in corpus
socindex$index <- as.numeric(socindex$index)
partindex <- merge(terms, partwords, by = "words") #identifies indices of pluralist terms - only 188 terms in corpus
partindex$index <- as.numeric(partindex$index)

#Calculate mean word vector of the seed dictionary terms
#Column means over the seed-term rows, stored as a one-row matrix whose row name is the label
#("mean_rep" / "mean_plu"); the scoring steps further down look the score rows up under these labels
mean_republican <- rbind(
  mean_rep = colMeans(word_embed_total[socindex$index, , drop = FALSE])
)
mean_pluralist <- rbind(
  mean_plu = colMeans(word_embed_total[partindex$index, , drop = FALSE])
)

#Calculate cosine similarity of mean word vectors to word vectors of all other words in the dictionary
#norm = "l2" makes sim2() rescale the input rows to unit L2 length before comparing them;
#this is a normalisation of the vectors, not a regularisation (no coefficients are shrunk) - describe it as such in the paper
similarity_republican <- sim2(x = word_embed_total, y = mean_republican, method = "cosine", norm = "l2")
head(sort(similarity_republican[, 1], decreasing = TRUE), 20) #show top republican terms
similarity_pluralist <- sim2(x = word_embed_total, y = mean_pluralist, method = "cosine", norm = "l2")
head(sort(similarity_pluralist[, 1], decreasing = TRUE), 20) #show top pluralist terms

#Calculate scores using the sigmoid function
#Logistic transform of the cosine similarities, mapping them into (0, 1)
#Republican scale: steepness 20, midpoint at a similarity of 0.35
scores_republican <- data.frame(
  scores_rep = 1 / (1 + exp(-20 * (similarity_republican - 0.35)))
)
range(scores_republican)
#Pluralist scale: steepness 15, midpoint at a similarity of 0.15
scores_pluralist <- data.frame(
  scores_plu = 1 / (1 + exp(-15 * (similarity_pluralist - 0.15)))
)
range(scores_pluralist)

#Collapsing all speeches made by an MP
#Move the pre-processed speeches to a quanteda corpus, keeping MP, constituency and original text as docvars
corpus_aux <- corpus(
  corpus_we$text,
  docvars = data.frame(
    MP = corpus_we$name,
    constituency = corpus_we$constituency,
    original_text = corpus_we$original
  )
)
#Tokenise, build the dfm and merge all speeches of the same MP into one document
#(force = TRUE is passed although the dfm is still unweighted here; tf-idf weighting happens in the next step)
dfm_we <- dfm_group(dfm(tokens(corpus_aux)), groups = MP, force = TRUE)
#Weight by tf-idf and convert to a dense data frame (one row per MP, one column per term)
dfm_we <- as.data.frame(as.matrix(dfm_tfidf(dfm_we)))
rm(corpus_aux)

#Append the scores to the dfm
#Turn each score column into a one-row data frame (terms become columns)
#The objects are overwritten in place, so re-running these two lines would transpose them back
scores_republican <- as.data.frame(t(scores_republican))
scores_pluralist <- as.data.frame(t(scores_pluralist))

#Stack the two score rows underneath the MP rows; terms missing on either side become NA
dfm_we <- dplyr::bind_rows(dfm_we, scores_republican, scores_pluralist)
#Replace all NAs with 0; nafill() output is wrapped back into a data frame with the original row and column names
dfm_we <- as.data.frame(
  nafill(dfm_we, fill = 0),
  row.names = rownames(dfm_we),
  col.names = colnames(dfm_we)
)

#Calculate scores
#For every MP row: sum(term weight * term score) / sum(term weight)
#The last two rows of dfm_we are the appended score rows and are excluded from the documents
scores_mean_rep <- local({
  doc_rows <- dfm_we[1:(nrow(dfm_we) - 2), ]
  term_scores <- as.vector(unlist(dfm_we[rownames(dfm_we) == "mean_rep", ]))
  weighted <- sweep(data.matrix(doc_rows), MARGIN = 2, term_scores, "*")
  rowSums(weighted, na.rm = TRUE) / rowSums(doc_rows, na.rm = TRUE)
})
scores_mean_plu <- local({
  doc_rows <- dfm_we[1:(nrow(dfm_we) - 2), ]
  term_scores <- as.vector(unlist(dfm_we[rownames(dfm_we) == "mean_plu", ]))
  weighted <- sweep(data.matrix(doc_rows), MARGIN = 2, term_scores, "*")
  rowSums(weighted, na.rm = TRUE) / rowSums(doc_rows, na.rm = TRUE)
})
#Republican-minus-pluralist scale
scores_rep_plu_scale <- scores_mean_rep - scores_mean_plu

#Assign to data
#Collect the three MP-level scores together with the MP name (taken from the names of the score vector)
#NOTE: the object name `merge` shadows base::merge(); the name is kept for compatibility, and the
#function is called with an explicit namespace so the call does not depend on R skipping non-function objects
merge <- cbind.data.frame(scores_mean_rep, scores_mean_plu, scores_rep_plu_scale, MP = names(scores_mean_rep))
#Left join: keep every MP in MP_data, MPs without scores get NA
MP_data <- base::merge(MP_data, merge, by.x = "MP", by.y = "MP", all.x = TRUE)

#Identify extreme texts
#Prepare the dfm for a random sample of individual speeches
#NOTE(review): the original comment called this dfm "tf-idf weighted", but no dfm_tfidf() is applied in this step - confirm which is intended
set.seed(35451)
corpus_aux <- sample_n(corpus_we, size = 10000) #sample 10,000 texts
#Move the sampled speeches to a quanteda corpus, keeping MP, constituency and original text as docvars
corpus_aux <- corpus(
  corpus_aux$text,
  docvars = data.frame(
    MP = corpus_aux$name,
    constituency = corpus_aux$constituency,
    original_text = corpus_aux$original
  )
)
#Tokenise, build the dfm and convert it to a dense data frame (one row per sampled speech); overwrites the MP-level dfm_we
dfm_we <- as.data.frame(as.matrix(dfm(tokens(corpus_aux))))

#Append the scores to the dfm
#scores_republican / scores_pluralist are used as-is here, i.e. in the one-row form produced by the MP-level step
#Stack the two score rows underneath the speech rows; terms missing on either side become NA
dfm_we <- dplyr::bind_rows(dfm_we, scores_republican, scores_pluralist)
#Replace all NAs with 0; nafill() output is wrapped back into a data frame with the original row and column names
dfm_we <- as.data.frame(
  nafill(dfm_we, fill = 0),
  row.names = rownames(dfm_we),
  col.names = colnames(dfm_we)
)

#Calculate scores
#For every sampled speech: sum(term weight * term score) / sum(term weight)
#The last two rows of dfm_we are the appended score rows and are excluded from the documents
scores_mean_rep_texts <- local({
  doc_rows <- dfm_we[1:(nrow(dfm_we) - 2), ]
  term_scores <- as.vector(unlist(dfm_we[rownames(dfm_we) == "mean_rep", ]))
  weighted <- sweep(data.matrix(doc_rows), MARGIN = 2, term_scores, "*")
  rowSums(weighted, na.rm = TRUE) / rowSums(doc_rows, na.rm = TRUE)
})
scores_mean_plu_texts <- local({
  doc_rows <- dfm_we[1:(nrow(dfm_we) - 2), ]
  term_scores <- as.vector(unlist(dfm_we[rownames(dfm_we) == "mean_plu", ]))
  weighted <- sweep(data.matrix(doc_rows), MARGIN = 2, term_scores, "*")
  rowSums(weighted, na.rm = TRUE) / rowSums(doc_rows, na.rm = TRUE)
})
#Republican-minus-pluralist scale
scores_rep_plu_scale_texts <- scores_mean_rep_texts - scores_mean_plu_texts

#Save example texts with scores
#One row per sampled speech: the three scores plus the original speech text and the MP (both read from the corpus docvars)
example_texts_we <- data.frame(
  scores_mean_rep_texts = scores_mean_rep_texts,
  scores_mean_plu_texts = scores_mean_plu_texts,
  scores_rep_plu_scale_texts = scores_rep_plu_scale_texts,
  speech = corpus_aux$original_text,
  MP = corpus_aux$MP,
  check.names = FALSE
)
rm(corpus_aux)


################
#Export data
################

#Write the MP-level estimates to disk
write.csv(x = MP_data, file = "./Created auxiliary datasets/text_models_MPs.csv")


#########################################
#Validation of word embeddings: expert
#########################################

#Split corpus into sentences
#The abbreviation "hon." is rewritten first so that its full stop is not treated as a sentence boundary
#NOTE(review): "hon." is used as a regular expression here (no fixed = TRUE), so the dot matches any character,
#e.g. "honest" also matches; left unchanged because the sentence sample drawn below depends on this exact output
#Segments are cut after every literal "." and the full stop stays in the text
corpus_we_sentences <- corpus_segment(
  corpus(
    gsub("hon.", "hon", as.character(blum$body)),
    docvars = data.frame(MP = blum$name)
  ),
  pattern = ".",
  valuetype = "fixed",
  pattern_position = "after",
  extract_pattern = FALSE
)
#Keep only segments with more than 15 tokens (author note: full stop counts as word, median is 24 words)
corpus_we_sentences <- corpus_we_sentences[ntoken(corpus_we_sentences) > 15]

#Function to randomly select whole groups (here: MPs) from a grouped data frame
#Arguments:
#  grouped_df: data frame grouped with dplyr::group_by()
#  size:       number of groups to draw
#  replace:    draw groups with replacement?
#  weight:     optional sampling weights, passed on to sample_n()
#Returns all rows of grouped_df that belong to the drawn groups, with an extra column unique_id
#(running number of the drawn group), grouped by the original grouping variables
sample_n_groups <- function(grouped_df, size, replace = FALSE, weight = NULL) {
  #Names of the grouping variables as a character vector
  grp_var <- group_vars(grouped_df)
  if (length(grp_var) == 0) {
    stop("`grouped_df` must be grouped (use group_by() first).", call. = FALSE)
  }
  #One row per group, then draw `size` of these rows
  #(.groups = "drop" so that groups are drawn from the full list even with several grouping variables)
  random_grp <- grouped_df %>%
    summarise(.groups = "drop") %>%
    sample_n(size, replace, weight) %>%
    mutate(unique_id = seq_len(nrow(.)))
  #Keep only the rows of the drawn groups and restore the grouping
  #(group_by_() is deprecated; across(all_of()) accepts the character vector of names)
  grouped_df %>%
    right_join(random_grp, by = grp_var) %>%
    group_by(across(all_of(grp_var)))
}

#Sample MPs and sentences
#The object is called `sample`, which masks base::sample as a variable name (function calls are unaffected)
set.seed(35589)
sample <- cbind(docvars(corpus_we_sentences), docname = docnames(corpus_we_sentences)) %>%
  group_by(MP) %>%
  sample_n_groups(35) %>% #sample 35 MPs
  group_by(MP) %>%
  sample_n(10) #sample 10 texts per MP

#Subset corpus and export sentences
#Keep only the sentences whose document name was drawn in the sample
coding_sample <- corpus_subset(corpus_we_sentences, docnames(corpus_we_sentences) %in% sample$docname)
#Lay the sentences out in two columns: documents 1-175 as text_a, documents 176-350 as text_b
#(as.character() replaces texts(), which is deprecated in quanteda 3)
export <- cbind.data.frame(text_a = as.character(coding_sample[1:175]), text_b = as.character(coding_sample[176:350]))
write.csv(export, "./Justification model/coding_justification.csv")

#Read in hand-coded sentences (completed coding sheet)
coded_sentences <- read_excel(path = "./Justification model/coding_justification_COMPLETED.xlsx")

#Prepare sentences to be scored (as speeches are prepared above)
sentences_scored <- c(coded_sentences$sentence_A, coded_sentences$sentence_B) #stack column A on top of column B
#Replace combined terms with multi-grams, one pattern after the other
#(seq_len() so that an empty tocompound table is skipped instead of looping over i = 1, 0)
for (i in seq_len(nrow(tocompound))) {
  sentences_scored <- gsub(tocompound$sepwords[i], tocompound$words[i], sentences_scored, ignore.case = TRUE)
}
sentences_scored <- gsub("(?!_)[[:punct:]]", " ", sentences_scored, perl = TRUE) #replace punctuation with a space, except for "_"
sentences_scored <- tolower(sentences_scored) #lowercasing
sentences_scored <- stringr::str_replace_all(sentences_scored, stopwords_regex, '') #remove English stopwords
sentences_scored <- gsub(" s ", " ", sentences_scored) #remove the stray "s" left over once the apostrophe of 's was replaced by a space
sentences_scored <- str_squish(sentences_scored) #strip whitespace

#Build dfm
#Tokenise explicitly before building the dfm: calling dfm() directly on a character vector is deprecated
#in quanteda 3, and dfm(tokens(...)) matches how the speech-level dfms are built in this file
dfm_sentences <- dfm(tokens(sentences_scored)) #create dfm in quanteda
dfm_sentences <- dfm_tfidf(dfm_sentences) #weight dfm by tf-idf
dfm_sentences <- as.data.frame(as.matrix(dfm_sentences)) #convert to data frame

#Append the scores to the dfm
#scores_republican / scores_pluralist are used as-is here, i.e. in the one-row form produced by the MP-level step
#Stack the two score rows underneath the sentence rows; terms missing on either side become NA
dfm_sentences <- dplyr::bind_rows(dfm_sentences, scores_republican, scores_pluralist)
#Replace all NAs with 0; nafill() output is wrapped back into a data frame with the original row and column names
dfm_sentences <- as.data.frame(
  nafill(dfm_sentences, fill = 0),
  row.names = rownames(dfm_sentences),
  col.names = colnames(dfm_sentences)
)

#Calculate scores
#For every sentence: sum(term weight * term score) / sum(term weight)
#The last two rows of dfm_sentences are the appended score rows and are excluded from the documents
scores_mean_rep_sentences <- local({
  doc_rows <- dfm_sentences[1:(nrow(dfm_sentences) - 2), ]
  term_scores <- as.vector(unlist(dfm_sentences[rownames(dfm_sentences) == "mean_rep", ]))
  weighted <- sweep(data.matrix(doc_rows), MARGIN = 2, term_scores, "*")
  rowSums(weighted, na.rm = TRUE) / rowSums(doc_rows, na.rm = TRUE)
})
scores_mean_plu_sentences <- local({
  doc_rows <- dfm_sentences[1:(nrow(dfm_sentences) - 2), ]
  term_scores <- as.vector(unlist(dfm_sentences[rownames(dfm_sentences) == "mean_plu", ]))
  weighted <- sweep(data.matrix(doc_rows), MARGIN = 2, term_scores, "*")
  rowSums(weighted, na.rm = TRUE) / rowSums(doc_rows, na.rm = TRUE)
})
#Republican-minus-pluralist scale
scores_rep_plu_scale_sentences <- scores_mean_rep_sentences - scores_mean_plu_sentences

#Intensity scoring analysis
#One row per coded sentence: model scores, raw and processed text, MP and the hand-coded intensities
#Columns A and B of the coding sheet are stacked in the same A-then-B order used for sentences_scored
#NOTE(review): MP is taken from coding_sample by position - presumably the coding sheet kept the export order; confirm
intensity_validation <- data.frame(
  scores_mean_rep_sentences = scores_mean_rep_sentences,
  scores_mean_plu_sentences = scores_mean_plu_sentences,
  scores_rep_plu_scale_sentences = scores_rep_plu_scale_sentences,
  sentences = c(coded_sentences$sentence_A, coded_sentences$sentence_B),
  processed_sentences = sentences_scored,
  MP = coding_sample$MP,
  intensity_rep = c(coded_sentences$intensity_rep_A, coded_sentences$intensity_rep_B),
  intensity_plu = c(coded_sentences$intensity_plu_A, coded_sentences$intensity_plu_B),
  check.names = FALSE
)
#Hand-coded counterpart of the republican-minus-pluralist scale
intensity_validation$diff_intensity <- intensity_validation$intensity_rep - intensity_validation$intensity_plu

#Assess quality on sentence level - reported in the Appendix
#Correlation between hand-coded intensity and model score, using complete observations only
with(intensity_validation, cor(intensity_rep, scores_mean_rep_sentences, use = "complete.obs"))
with(intensity_validation, cor(intensity_plu, scores_mean_plu_sentences, use = "complete.obs"))
with(intensity_validation, cor(diff_intensity, scores_rep_plu_scale_sentences, use = "complete.obs"))

#Aggregate to MP level
#Mean of the hand-coded intensities and of the model scores per MP, ignoring missing values
#(across() replaces the superseded summarize_at(); the result is an ungrouped table with one row per MP)
intensity_validation_aggregated <- intensity_validation %>%
  group_by(MP) %>%
  dplyr::summarise(
    across(
      all_of(c(
        "intensity_rep", "intensity_plu", "diff_intensity",
        "scores_mean_rep_sentences", "scores_mean_plu_sentences", "scores_rep_plu_scale_sentences"
      )),
      ~ mean(.x, na.rm = TRUE)
    ),
    .groups = "drop"
  )

#Assess quality on MP level - reported in the Appendix
#Correlation between the MP-level means of hand-coded intensity and model score, using complete observations only
with(intensity_validation_aggregated, cor(intensity_rep, scores_mean_rep_sentences, use = "complete.obs"))
with(intensity_validation_aggregated, cor(intensity_plu, scores_mean_plu_sentences, use = "complete.obs"))
with(intensity_validation_aggregated, cor(diff_intensity, scores_rep_plu_scale_sentences, use = "complete.obs"))


